library(xlsx)
library(ggplot2)
library(psych)

df<-read.xlsx("dataset.xlsx", 1)
# df<-read.table("dataset.csv", header = TRUE, sep =",")
df1<-df[-which(df$exclude =="yes"),]
df1<-df1[1:248,]


A1<-t(describe(df1[which(df1$source =="mae-sot" & df1$Gender =="Male"),c(14:18,22)]))
A2<-t(describe(df1[which(df1$source =="mae-sot" & df1$Gender =="Female"),c(14:18,22)]))
A3<-t(describe(df1[which(df1$source !="mae-sot" & df1$Gender =="Male"),c(14:18,22)]))
A4<-t(describe(df1[which(df1$source !="mae-sot" & df1$Gender =="Female"),c(14:18,22)]))

cbc.stats<-rbind(A1,A2,A3,A4)
write.csv(cbc.stats,"cbc.stats.csv")

#############################################
A1<-t(describe(df1[which(df1$Gender =="Male"),c(14:18,22)]))
A2<-t(describe(df1[which(df1$Gender =="Female"),c(14:18,22)]))

cbc.stats<-rbind(A1,A2)
write.csv(cbc.stats,"cbc.stats.combined.csv")


B1 <- aov(df1[,14] ~ df1$Gender)
B2 <- aov(df1[,15] ~ df1$Gender)
B3 <- aov(df1[,16] ~ df1$Gender)
B4 <- aov(df1[,17] ~ df1$Gender)
B5 <- aov(df1[,18] ~ df1$Gender)
B6 <- aov(df1[,22] ~ df1$Gender)

C1<-TukeyHSD(B1)
C2<-TukeyHSD(B2)
C3<-TukeyHSD(B3)
C4<-TukeyHSD(B4)
C5<-TukeyHSD(B5)
C6<-TukeyHSD(B6)

anova.out<-rbind(C1[[1]],C2[[1]],C3[[1]],C4[[1]],C5[[1]],C6[[1]])
write.csv(anova.out,"anova.cbc.csv")

#df1$mutation[which(df1$mutation =="A+")] <-"WT"


phenotype<-array(NA,nrow(df1))
idx_A <-which(df1$mutation =="A+")
idx_wt<-which(df1$mutation =="WT")
idx_mut<-intersect(which(df1$mutation !="WT") , which(df1$mutation !="A+"))
# idx_mut<-which(df1$mutation !="WT")  which(df1$mutation !="A+")
phenotype[idx_A]<-"A+"
phenotype[idx_wt]<-"WT"
phenotype[idx_mut]<-"deficient mutation"

df1$phenotype <-phenotype
df1$zygosity[172] <- "homozygous"
#df1<-df1[1:148,]
df1$type<-paste(df1$zygosity,df1$phenotype)


p1 <- ggplot(df1, aes(phenotype, X.BC))
p1 <- p1 + geom_boxplot(outlier.colour=NA)
p1<-p1 + geom_point(aes(shape = factor(country)),position = position_jitter(width=0.2)) + xlab("G6PD mutation") + ylab("% bright cells") + facet_grid(.~Gender + zygosity,scales="free",space="free")+
  theme(legend.position="top") +
  theme(legend.title=element_blank())

p2 <- ggplot(df1, aes(phenotype, G6PD_HGB))
p2 <- p2 + geom_boxplot(outlier.colour=NA)
p2<-p2 + geom_point(aes(shape = factor(country)),position = position_jitter(width=0.2)) + xlab("G6PD mutation") + ylab("G6PD activity U/gHb") + facet_grid(.~Gender + zygosity,scales="free",space="free")+
  theme(legend.position="top") +
  theme(legend.title=element_blank())

p3 <- ggplot(df1, aes(phenotype, G6PD_RBC))
p3 <- p3 + geom_boxplot(outlier.colour=NA)
p3<-p3 + geom_point(aes(shape = factor(country)),position = position_jitter(width=0.2)) + xlab("G6PD mutation") + ylab("G6PD activity U/RBC") + facet_grid(.~Gender + zygosity,scales="free",space="free")+
  theme(legend.position="top") +
  theme(legend.title=element_blank())
# 
# 
# 
# 
# p1 <- ggplot(df1, aes(mutation, X.BC))
# p1 <- p1 + geom_boxplot(outlier.colour=NA)
# p1<-p1 + geom_point(aes(shape = factor(country)),position = position_jitter(width=0.2)) + xlab("G6PD mutation") + ylab("% bright cells") + facet_grid(.~Gender + zygosity,scales="free",space="free")+
#   theme(legend.position="top") +
#   theme(legend.title=element_blank())
# 
# p2 <- ggplot(df1, aes(mutation, G6PD_HGB))
# p2 <- p2 + geom_boxplot(outlier.colour=NA)
# p2<-p2 + geom_point(aes(shape = factor(country)),position = position_jitter(width=0.2)) + xlab("G6PD mutation") + ylab("G6PD activity U/gHb") + facet_grid(.~Gender + zygosity,scales="free",space="free")+
#   theme(legend.position="top") +
#   theme(legend.title=element_blank())
# 
# p3 <- ggplot(df1, aes(mutation, G6PD_RBC))
# p3 <- p3 + geom_boxplot(outlier.colour=NA)
# p3<-p3 + geom_point(aes(shape = factor(country)),position = position_jitter(width=0.2)) + xlab("G6PD mutation") + ylab("G6PD activity U/RBC") + facet_grid(.~Gender + zygosity,scales="free",space="free")+
#   theme(legend.position="top") +
#   theme(legend.title=element_blank())

library(grid)
library(gridExtra)
grid.arrange(p1, p2,p3, ncol = 1)

idx_us<-which(df1$country=="United States")
idx_th<-which(df1$country=="Thailand")

# ols_us<-lm(df1$X.BC ~df1$G6PD_HGB)
# ols2<-lm(df1$X.BC ~df1$G6PD_RBC)
df1$HB_status<- array("Normal", nrow(df1))
df1$HB_status[which(df1$HGB<=11.5)]<-"Anemic"


p1<-ggplot(df1, aes(y=X.BC, x=G6PD_HGB,color = factor(HB_status))) + geom_point(aes(shape = factor(Gender))) +
  facet_grid(.~country ,scales="free",space="free") + xlab("G6PD activity U/gHb" )+
  ylab("% bright cells")+theme(legend.title=element_blank()) 
  #geom_abline(intercept = ols$coefficients[1], slope = ols$coefficients[2]) +
  #geom_text()


p2<-ggplot(df1, aes(y=X.BC, x=G6PD_RBC,color = factor(HB_status))) + geom_point(aes(shape = factor(Gender))) +
  facet_grid(.~country ,scales="free",space="free") + xlab("G6PD activity U/RBC" )+
  ylab("% bright cells")+theme(legend.title=element_blank())
#geom_abline(intercept = ols2$coefficients[1], slope = ols2$coefficients[2])

grid.arrange(p1, p2, ncol = 2)

####  blad altman


library(plyr)

df2<-df1[which(df1$zygosity =="heterozygous"),]

mean(df2$X.BC -df2$G6PD_HGB)
lin.fit<-lm(df2$G6PD_HGB ~df2$X.BC)
df2$residuals<- lin.fit$residuals
df2$means<-rowMeans(cbind(df2$X.BC,df2$G6PD_HGB))
sdv1<-sd(df2$residuals)

p1<-ggplot(df2, aes(x=means,y=residuals)) +geom_point() + geom_hline(aes(yintercept=0))+
geom_hline(aes(yintercept= 2*sdv1),col ="red")+geom_hline(aes(yintercept= -2*sdv1),col ="red")+
  facet_grid(.~country ,scales="free",space="free") + xlab("means of % bright cells and G6PD activity U/gHb") +
  ylab("residuals from best linear fit % bright cells and G6PD U/gHb")

df2_2<-df1[which(df1$zygosity =="heterozygous"),]

mean(df2_2$X.BC -df2_2$G6PD_RBC)
lin.fit<-lm(df2_2$G6PD_RBC ~df2_2$X.BC)
df2_2$residuals<- lin.fit$residuals
df2_2$means<-rowMeans(cbind(df2_2$X.BC,df2_2$G6PD_RBC))
sdv2<-sd(df2_2$residuals)

p2<-ggplot(df2_2, aes(x=means,y=residuals)) +geom_point() + geom_hline(aes(yintercept=0))+
  geom_hline(aes(yintercept= 2*sdv2),col ="red")+geom_hline(aes(yintercept= -2*sdv2),col ="red")+
  facet_grid(.~country ,scales="free",space="free") + xlab("means of % bright cells and G6PD activity U/RBC") +
  ylab("residuals from best linear fit % bright cells and G6PD activity U/RBC")

grid.arrange(p1, p2, ncol = 1)